Import Libraries

library("tidyverse")
library("dplyr")

Read dataset_1 - Note: Removed first column for ID values

dataset_1 <- read.csv("../raw_data/dataset_1/winemag-data-130k-v2.csv")[-1]

Combine all the csv files for dataset_2

# function reads csv values
readfile <- function(filename){
  cat("Reading file: ", filename, "...\n", sep = '' )
  return(read.csv(file = filename, header = TRUE))
}

# function merges two dataframes together
merge_dataframes <- function(dataframe_1, dataframe_2){
  return(rbind(dataframe_1, dataframe_2))
}

# funciton merges all csv files in folder to one dataframe
combine_data = function(mypath){
  filenames=list.files(path=mypath, full.names=TRUE)
  datalist = lapply(filenames, function(x) readfile(x))
  return(Reduce(f = function(x,y) merge_dataframes(x,y), x = datalist, accumulate = FALSE))
}

dataset_2 <- combine_data("../raw_data/dataset_2")
Reading file: ../raw_data/dataset_2/winemag-1-800.csv...
Reading file: ../raw_data/dataset_2/winemag-10401-11200.csv...
Reading file: ../raw_data/dataset_2/winemag-11201-12000.csv...
Reading file: ../raw_data/dataset_2/winemag-12001-12800.csv...
Reading file: ../raw_data/dataset_2/winemag-1601-2400.csv...
Reading file: ../raw_data/dataset_2/winemag-2401-3200.csv...
Reading file: ../raw_data/dataset_2/winemag-3201-4000.csv...
Reading file: ../raw_data/dataset_2/winemag-4001-4800.csv...
Reading file: ../raw_data/dataset_2/winemag-4801-5600.csv...
Reading file: ../raw_data/dataset_2/winemag-5601-6400.csv...
Reading file: ../raw_data/dataset_2/winemag-6401-7200.csv...
Reading file: ../raw_data/dataset_2/winemag-7201-8000.csv...
Reading file: ../raw_data/dataset_2/winemag-8001-8800.csv...
Reading file: ../raw_data/dataset_2/winemag-801-1600.csv...
Reading file: ../raw_data/dataset_2/winemag-8801-9600.csv...
Reading file: ../raw_data/dataset_2/winemag-9601-10400.csv...

Join datasets by title, description, price, country, points = rating

join_wines <- inner_join(dataset_1, dataset_2, by = c("title", "description", "points" = "rating", "price", "country"))
Column `title` joining factors with different levels, coercing to character vectorColumn `description` joining factors with different levels, coercing to character vectorColumn `country` joining factors with different levels, coercing to character vector

Visualize Joined data

head(join_wines)

Output uncleaned joined data for others to use

write_csv(join_wines, "joined_datasets.csv")

Clean joined data to remove redundancies

join_wines_cleaned <- join_wines %>%
  select(country, description, designation=designation.x, points, price, province, region=region_1, subregion=region_2, taster_name, taster_twitter_handle, title, variety, winery=winery.x, alcohol, category, url, vintage)
head(join_wines_cleaned)

Outout cleaned joined data for others to use

write_csv(join_wines_cleaned, "joined_datasets_cleaned.csv")

Extract relevant information for our use

wines <- join_wines_cleaned %>%
  select(title, alcohol, category, vintage, designation, country, province, region, subregion,  variety, winery, price, points, taster_name, taster_twitter_handle)

Output for easy import later

write_csv(wines, "wines.csv")
LS0tCnRpdGxlOiAiUHJlcHJjZXNzaW5nIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCkltcG9ydCBMaWJyYXJpZXMKYGBge3J9CmxpYnJhcnkoInRpZHl2ZXJzZSIpCmxpYnJhcnkoImRwbHlyIikKYGBgCgpSZWFkIGRhdGFzZXRfMSAtIE5vdGU6IFJlbW92ZWQgZmlyc3QgY29sdW1uIGZvciBJRCB2YWx1ZXMKYGBge3J9CmRhdGFzZXRfMSA8LSByZWFkLmNzdigiLi4vcmF3X2RhdGEvZGF0YXNldF8xL3dpbmVtYWctZGF0YS0xMzBrLXYyLmNzdiIpWy0xXQpgYGAKCkNvbWJpbmUgYWxsIHRoZSBjc3YgZmlsZXMgZm9yIGRhdGFzZXRfMgpgYGB7cn0KIyBmdW5jdGlvbiByZWFkcyBjc3YgdmFsdWVzCnJlYWRmaWxlIDwtIGZ1bmN0aW9uKGZpbGVuYW1lKXsKICBjYXQoIlJlYWRpbmcgZmlsZTogIiwgZmlsZW5hbWUsICIuLi5cbiIsIHNlcCA9ICcnICkKICByZXR1cm4ocmVhZC5jc3YoZmlsZSA9IGZpbGVuYW1lLCBoZWFkZXIgPSBUUlVFKSkKfQoKIyBmdW5jdGlvbiBtZXJnZXMgdHdvIGRhdGFmcmFtZXMgdG9nZXRoZXIKbWVyZ2VfZGF0YWZyYW1lcyA8LSBmdW5jdGlvbihkYXRhZnJhbWVfMSwgZGF0YWZyYW1lXzIpewogIHJldHVybihyYmluZChkYXRhZnJhbWVfMSwgZGF0YWZyYW1lXzIpKQp9CgojIGZ1bmNpdG9uIG1lcmdlcyBhbGwgY3N2IGZpbGVzIGluIGZvbGRlciB0byBvbmUgZGF0YWZyYW1lCmNvbWJpbmVfZGF0YSA9IGZ1bmN0aW9uKG15cGF0aCl7CiAgZmlsZW5hbWVzPWxpc3QuZmlsZXMocGF0aD1teXBhdGgsIGZ1bGwubmFtZXM9VFJVRSkKICBkYXRhbGlzdCA9IGxhcHBseShmaWxlbmFtZXMsIGZ1bmN0aW9uKHgpIHJlYWRmaWxlKHgpKQogIHJldHVybihSZWR1Y2UoZiA9IGZ1bmN0aW9uKHgseSkgbWVyZ2VfZGF0YWZyYW1lcyh4LHkpLCB4ID0gZGF0YWxpc3QsIGFjY3VtdWxhdGUgPSBGQUxTRSkpCn0KCmRhdGFzZXRfMiA8LSBjb21iaW5lX2RhdGEoIi4uL3Jhd19kYXRhL2RhdGFzZXRfMiIpCmBgYAoKSm9pbiBkYXRhc2V0cyBieSBgdGl0bGVgLCAgYGRlc2NyaXB0aW9uYCwgYHByaWNlYCwgYGNvdW50cnlgLCBgcG9pbnRzYCA9IGByYXRpbmdgCmBgYHtyfQpqb2luX3dpbmVzIDwtIGlubmVyX2pvaW4oZGF0YXNldF8xLCBkYXRhc2V0XzIsIGJ5ID0gYygidGl0bGUiLCAiZGVzY3JpcHRpb24iLCAicG9pbnRzIiA9ICJyYXRpbmciLCAicHJpY2UiLCAiY291bnRyeSIpKQpgYGAKClZpc3VhbGl6ZSBKb2luZWQgZGF0YQpgYGB7cn0KaGVhZChqb2luX3dpbmVzKQpgYGAKCk91dHB1dCB1bmNsZWFuZWQgam9pbmVkIGRhdGEgZm9yIG90aGVycyB0byB1c2UKYGBge3J9CndyaXRlX2Nzdihqb2luX3dpbmVzLCAiam9pbmVkX2RhdGFzZXRzLmNzdiIpCmBgYAoKQ2xlYW4gam9pbmVkIGRhdGEgdG8gcmVtb3ZlIHJlZHVuZGFuY2llcwpgYGB7cn0Kam9pbl93aW5lc19jbGVhbmVkIDwtIGpvaW5fd2luZXMgJT4lCiAgc2VsZWN0KGNvdW50cnksIGRlc2NyaXB0aW9uLCBkZXNpZ25hdGlvbj1kZXNpZ25hdGlvbi54LCBwb2ludHMsIHByaWNlLCBwcm92aW5jZSwgcmVnaW9uPXJlZ2lvbl8xLCBzdWJyZWdpb249cmVnaW9uXzIsIHRhc3Rlcl9uYW1lLCB0YXN0ZXJfdHdpdHRlcl9oYW5kbGUsIHRpdGxlLCB2YXJpZXR5LCB3aW5lcnk9d2luZXJ5LngsIGFsY29ob2wsIGNhdGVnb3J5LCB1cmwsIHZpbnRhZ2UpCmhlYWQoam9pbl93aW5lc19jbGVhbmVkKQpgYGAKCk91dG91dCBjbGVhbmVkIGpvaW5lZCBkYXRhIGZvciBvdGhlcnMgdG8gdXNlCmBgYHtyfQp3cml0ZV9jc3Yoam9pbl93aW5lc19jbGVhbmVkLCAiam9pbmVkX2RhdGFzZXRzX2NsZWFuZWQuY3N2IikKYGBgCgpFeHRyYWN0IHJlbGV2YW50IGluZm9ybWF0aW9uIGZvciBvdXIgdXNlCmBgYHtyfQp3aW5lcyA8LSBqb2luX3dpbmVzX2NsZWFuZWQgJT4lCiAgc2VsZWN0KHRpdGxlLCBhbGNvaG9sLCBjYXRlZ29yeSwgdmludGFnZSwgZGVzaWduYXRpb24sIGNvdW50cnksIHByb3ZpbmNlLCByZWdpb24sIHN1YnJlZ2lvbiwgIHZhcmlldHksIHdpbmVyeSwgcHJpY2UsIHBvaW50cywgdGFzdGVyX25hbWUsIHRhc3Rlcl90d2l0dGVyX2hhbmRsZSkKYGBgCgpPdXRwdXQgZm9yIGVhc3kgaW1wb3J0IGxhdGVyCmBgYHtyfQp3cml0ZV9jc3Yod2luZXMsICJ3aW5lcy5jc3YiKQpgYGAKCgoK